Intro

In this document, we will analyse RNAseq data (counts at gene level from salmon) using DESeq2. The data comes from 6 populations (of 2 geographic regions) and each population was subjected to benign conditions (constant high water level) and stressful conditions (low water level).

Prep environment

Set working directory and load libraries

#setwd("~/Desktop/rnaseq/")
# NOTE(review): hard-coded, machine-specific working directory — consider a
# relative path or the `here` package so the analysis is portable across machines
setwd("~/Documents/students_MSc/clara/rnaseq/")

library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ dplyr::collapse()     masks IRanges::collapse()
## ✖ dplyr::combine()      masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count()        masks matrixStats::count()
## ✖ dplyr::desc()         masks IRanges::desc()
## ✖ tidyr::expand()       masks S4Vectors::expand()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ dplyr::first()        masks S4Vectors::first()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ ggplot2::Position()   masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce()       masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename()       masks S4Vectors::rename()
## ✖ lubridate::second()   masks S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::slice()        masks IRanges::slice()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(fdrtool)
library(UpSetR)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

Import read counts

Import counts prepared with tximport and clean

# gene-level salmon counts summarised with tximport
# (a list holding $abundance, $counts and $length matrices; see subsetting below)
txi<-readRDS("./salmon_gene_counts.rds")

Remove non-coding regions and baits

# remove mtDNA, non-coding and nr baits

# make a list of genes we want to keep
# NOTE(review): str_detect matches "mt", "nr" or "nc" ANYWHERE in the gene_id,
# not only as a prefix/suffix — confirm no protein-coding IDs contain these substrings
whitelist<-txi$counts %>%
  as_tibble(rownames = "gene_id") %>%
  filter(!str_detect(gene_id, pattern = "mt|nr|nc")) %>%
  pull(gene_id)

length(whitelist);head(whitelist) # we are keeping 32531 genes
## [1] 32531
## [1] "PECUL23A000004" "PECUL23A000005" "PECUL23A000006" "PECUL23A000008"
## [5] "PECUL23A000011" "PECUL23A000012"
# filter txi tables
# subset all three tximport matrices to the same whitelisted genes (rows),
# keeping them in identical row order
txi$abundance<-txi$abundance[whitelist,]
txi$counts<-txi$counts[whitelist,]
txi$length<-txi$length[whitelist,]

Load design matrix

Load the “column data” for the dds object.

### load design matrix
des.mat<-read_csv("./design_matrix.csv")
## Rows: 48 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): sample_id, treatment, population, region
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# re-order factor levels
des.mat <- des.mat %>%
  mutate(population=factor(population, levels=c("Bui","Can","Tur","Esp","Jab","Lla"))) %>% # re-order factors for easy plotting later
  mutate(pop_n = factor(rep(rep(1:3,each=8),2))) %>% # make new, non-nested pop variable !!! BE CAREFUL WITH THE ORDER OF VARIABLES: this assumes rows are sorted as 2 regions x 3 populations x 8 samples
  mutate_if(is.character, as.factor) # convert characters to factor


# filter samples that had substantially lower library size (Jab) and a potential outlier (Bui)
des.mat<-des.mat %>%
  filter(!sample_id %in% c("Bui4H14_nonrrna","Jab5H6_nonrrna")) # could also remove: "Tur2H6_nonrrna","Bui1L9_nonrrna"

# filter txi tables
# drop the same two samples (columns) from all three tximport matrices
txi$abundance<-txi$abundance[,as.character(des.mat$sample_id)]
txi$counts<-txi$counts[,as.character(des.mat$sample_id)]
txi$length<-txi$length[,as.character(des.mat$sample_id)]

## get column order of counts matrix and re-order des.mat to match
col_order<-match(colnames(txi$counts),des.mat$sample_id)
des.mat<-des.mat[col_order,]
des.mat$sample_id==colnames(txi$counts) # sanity check: every entry should be TRUE
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE
des.mat
## # A tibble: 46 × 5
##    sample_id       treatment population region  pop_n
##    <fct>           <fct>     <fct>      <fct>   <fct>
##  1 Bui1H9_nonrrna  H         Bui        central 1    
##  2 Bui1L9_nonrrna  L         Bui        central 1    
##  3 Bui2H12_nonrrna H         Bui        central 1    
##  4 Bui2L1_nonrrna  L         Bui        central 1    
##  5 Bui3H10_nonrrna H         Bui        central 1    
##  6 Bui3L8_nonrrna  L         Bui        central 1    
##  7 Bui4L4_nonrrna  L         Bui        central 1    
##  8 Can1H4_nonrrna  H         Can        central 2    
##  9 Can1L1_nonrrna  L         Can        central 2    
## 10 Can2H12_nonrrna H         Can        central 2    
## # ℹ 36 more rows

Make DESeq objects

Make deseq object. Our primary objective is to test the effect of dropping the water level on tadpoles from the southern and central populations. We therefore want to contrast high vs. low water for each of the regions and so we have to include a region:treatment interaction effect. However, we are expecting region alone to be an important factor affecting gene expression (almost like a batch effect) and we also want to correct for potential minor differences of having sampled different populations within each regions. See a similar setup here: https://support.bioconductor.org/p/87324/.

Here, the formula is constructed in such a way that populations are treated as replicates per region. In order for this to work, we have to re-code the populations as 1 to 3 within each region (see chunk above). Otherwise, each population/replicate is unique to each region and will throw back an error.

We drop the intercept (first term in the formula is zero), because this allows us to more easily compare contrasts. If not, the base level expression would be that of the intercept, which is more difficult to interpret for such a complex model design. Read more here: https://www.biostars.org/p/395926/

# model: no intercept (0+), so each region gets its own baseline; populations
# are coded 1-3 within region (pop_n) and act as replicates; the
# region:treatment interaction gives a separate treatment effect per region
dds<-DESeqDataSetFromTximport(txi,
                              colData = des.mat,
                              design = ~ 0+region + region:pop_n + region:treatment)
## using counts and average transcript lengths from tximport
# make sure metadata is in matching order
des.mat$sample_id==colnames(assay(dds))
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE

Pre filtering

This is a somewhat arbitrary step. It may be worth playing around with how this affects the final outcome.

rowSums(counts(dds)) %>%
          enframe() %>%
  ggplot(aes(x=value)) +
  geom_histogram() +
  scale_x_log10()
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 8426 rows containing non-finite outside the scale range
## (`stat_bin()`).

# let's apply 2 different filters

dds1<-dds[rowSums(counts(dds) >= 1) >= 12,] # genes that have at least 1 count for 12 samples (e.g. one per treatment per population)
dim(dds1) # we are left with ~18k genes 
## [1] 17987    46
dds2<-dds[rowSums(counts(dds) >= 50) >= 12,] # genes that have at least 50 counts for 12 samples (e.g. one per treatment per population)
dim(dds2) # we are left with ~12k genes 
## [1] 11657    46

Data exploration

Its always a good idea to plot a PCA of the counts data.

Although for the actual DE expression we use raw count data, for visualization or clustering, it makes more sense to use some sort of transformed count. DESeq authors argue that variance stabilized transformation (VST) or regularized logarithm (rlog) have certain advantages. VST is faster, i.e. good for large datasets, rlog is more often used for small datasets. Let's use VST here.

# vst transformation (blind = TRUE: ignore the design, appropriate for QC/EDA)
vst_dds<-vst(dds, blind = TRUE) # no filtering
## using 'avgTxLength' from assays(dds), correcting for library size
vst_dds1<-vst(dds1, blind = TRUE) # mild filtering
## using 'avgTxLength' from assays(dds), correcting for library size
vst_dds2<-vst(dds2, blind = TRUE) # hard filtering
## using 'avgTxLength' from assays(dds), correcting for library size
# perform PCA on TRANSPOSED, centered data (prcomp centers but does NOT scale by default)
vst_pca<- prcomp(t(assay(vst_dds)),  center = TRUE)
vst_pca1<- prcomp(t(assay(vst_dds1)), center = TRUE)
vst_pca2<- prcomp(t(assay(vst_dds2)), center = TRUE)

To plot multiple axes at once, I am going to use paired plots. The densities on the diagonal are also useful for seeing which PC does the best job at separating out treatments.

PCA emphasizing regions

vst_pca$x %>%
  as_tibble(rownames = "sample_id") %>%
  left_join(des.mat) %>% # add the experimental design information
  ggpairs(columns = c("PC1", "PC2", "PC3", "PC4","PC5","PC6"),  aes(color=region, shape=treatment)) +
  ggtitle("VST PCA (no filtering)")
## Joining with `by = join_by(sample_id)`
## Warning in geom_point(): All aesthetics have length 1, but the data has 36 rows.
## ℹ Did you mean to use `annotate()`?

vst_pca1$x %>%
  as_tibble(rownames = "sample_id") %>%
  left_join(des.mat) %>% # add the experimental design information
  ggpairs(columns = c("PC1", "PC2", "PC3", "PC4","PC5","PC6"),  aes(color=region, shape=treatment)) +
  ggtitle("VST PCA (mild filtering)") # fixed typo: "liftering" -> "filtering"
## Joining with `by = join_by(sample_id)`
## Warning in geom_point(): All aesthetics have length 1, but the data has 36 rows.
## ℹ Did you mean to use `annotate()`?

vst_pca2$x %>%
  as_tibble(rownames = "sample_id") %>%
  left_join(des.mat) %>% # add the experimental design information
  ggpairs(columns = c("PC1", "PC2", "PC3", "PC4","PC5","PC6"),  aes(color=region, shape=treatment)) +
  ggtitle("VST PCA (hard filtering)") # fixed typo: "liftering" -> "filtering"
## Joining with `by = join_by(sample_id)`
## Warning in geom_point(): All aesthetics have length 1, but the data has 36 rows.
## ℹ Did you mean to use `annotate()`?

#vst_pca2$x[,colnames(vst_pca$x)=="PC5"]<(-40)

# using canned function from DESeq2
#plotPCA(vst_dds, intgroup=c("region"), ntop=Inf) +
#  coord_fixed()

Here we can quite clearly see that the strongest effect is that of the region. PC1 separates them out quite neatly.

Filtering doesn’t have a noticeable effect.

PCA emphasizing treatment

vst_pca1$x %>%
  as_tibble(rownames = "sample_id") %>%
  left_join(des.mat) %>% # add the experimental design information
  ggpairs(columns = c("PC1", "PC2", "PC3", "PC4","PC5","PC6"),  aes(color=treatment, shape=region)) +
  ggtitle("VST PCA")
## Joining with `by = join_by(sample_id)`
## Warning in geom_point(): All aesthetics have length 1, but the data has 36 rows.
## ℹ Did you mean to use `annotate()`?

Plotting the same PCA, but colouring treatments rather than regions, shows that we have to go down to PC3 to start getting treatment separations. PC1 vs PC3 may give us the best separation of both region and treatment.

We can plot specific axes again to highlight some of this more clearly.

# calculate hulls
pca_hull <- 
  vst_pca1$x %>% 
  as_tibble(rownames = "sample_id") %>%
  left_join(des.mat) %>%
  group_by(population, treatment) %>% 
  dplyr::slice(chull(PC1, PC3))
## Joining with `by = join_by(sample_id)`
# plot PCA and facet into different populations
vst_pca1$x %>%
  as_tibble(rownames = "sample_id") %>%
  left_join(des.mat) %>% # add the experimental design information
  ggplot(aes(x=PC1, y=PC3, color=population, shape=treatment)) +
  geom_hline(yintercept = 0, linewidth=0.1) +
  geom_vline(xintercept = 0, linewidth=0.1) +
  geom_point(size=3) +
  geom_polygon(data = pca_hull,
               aes(fill = population,
                   colour = population,
                   linetype=treatment),
               alpha = 0.1,
               show.legend = FALSE) +
  ggrepel::geom_text_repel(aes(label=sample_id),size=2) +
  scale_color_manual(values=c("Bui"="#900C3F", "Can"="#FF5733", "Tur"="#FFC300",
                              Esp="#0FBA3D", "Jab"="#0FBA93", "Lla"="#0F8CBA"))+
  scale_fill_manual(values=c("Bui"="#900C3F", "Can"="#FF5733", "Tur"="#FFC300",
                              Esp="#0FBA3D", "Jab"="#0FBA93", "Lla"="#0F8CBA"))+
  facet_wrap(~population) +
  theme_minimal()
## Joining with `by = join_by(sample_id)`

we can just about see that the warm colours (central populations) are found on the left side of the plotting area and the green colours (southern) on the right side. So PC1 is splitting regions quite well. the Y axis (PC3) on the other hand is fairly consistently splitting the triangles from the squares (except for maybe in Tur). So we are getting a consistent treatment signal too.

Differential gene expression

we can now run the DESeq pipeline, which will automatically run a number of steps including controlling for different library sizes etc, and finally also fitting the generalized linear model.

dds <- DESeq(dds)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
dds1 <- DESeq(dds1)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
dds2 <- DESeq(dds2)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing

Run Wald’s test comparing high vs low treatments for each of the two regions

resultsNames(dds1) # there are many coefficients (effects), due to the population replicates, but we are not really interested in all of these
## [1] "regioncentral"            "regionsouth"             
## [3] "regioncentral.pop_n2"     "regionsouth.pop_n2"      
## [5] "regioncentral.pop_n3"     "regionsouth.pop_n3"      
## [7] "regioncentral.treatmentL" "regionsouth.treatmentL"
# compares Low vs High (high is the reference level for treatment) in the central region
res_treat_central<-results(dds1, name="regioncentral.treatmentL")  
summary(res_treat_central, alpha=0.05)
## 
## out of 17987 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 88, 0.49%
## LFC < 0 (down)     : 5, 0.028%
## outliers [1]       : 124, 0.69%
## low counts [2]     : 3104, 17%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# compares Low vs High (high is the reference level for treatment) in the SOUTH region
res_treat_south<-results(dds1, name="regionsouth.treatmentL")  
summary(res_treat_south, alpha=0.05)
## 
## out of 17987 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 443, 2.5%
## LFC < 0 (down)     : 540, 3%
## outliers [1]       : 124, 0.69%
## low counts [2]     : 3445, 19%
## (mean count < 7)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# lets combine them into a list

res_dds<-list("central"=res_treat_central,
              "south"=res_treat_south)

Relatively low numbers of DEGs. We should diagnose our models

examine results

# let's look at the P-value distributions, one panel per region
par(mfrow=c(2,1))
for(i in seq_along(res_dds)){ # seq_along() is safe even for an empty list (1:length() is not)
  hist(res_dds[[i]]$pvalue, breaks=40, col="grey", main=names(res_dds)[i])
}

par(mfrow=c(1,1))

They look OK

# plot numbers of up/down-regulated DEGs per region
res_dds %>%
  lapply(as_tibble,rownames = "gene_id") %>%
  bind_rows(.id="region") %>% # list names are regions (central/south), not populations
  drop_na(padj) %>% # drop all genes with NAs
  filter(padj<0.1) %>%
  mutate(updown=ifelse(log2FoldChange>0, "up", "down")) %>%
  group_by(region, updown) %>%
  summarise(n=n()) %>%
  mutate(n=ifelse(updown=="down", n*-1, n)) %>% # negate "down" counts so they plot below zero
  ggplot(aes(x=region, y=n, fill=updown)) +
  geom_bar(stat="identity") +
  theme_bw() +
  theme(legend.position = "none")
## `summarise()` has grouped output by 'population'. You can override using the
## `.groups` argument.

Local dispersion fits

in general, p values should be evenly distributed, with an inflation of p=0. Common “bad” distributions include “U-shaped” distributions or “hill shaped” distributions. The present distribution is not bad, but we can see if it would be better using a local fit instead of a parametric fit for the dispersion.

# Whether a gene is called significant depends not only on its LFC but also on its within-group variability, which DESeq2 quantifies as the dispersion.
## we can inspect the dispersion of a (default) parametric fit vs a local fit
disp.par <- estimateDispersions(dds1, fitType = "parametric")
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
disp.loc <- estimateDispersions(dds1, fitType = "local")
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
par(mfrow=c(2,1))
plotDispEsts(disp.par, main= "dispEst: parametric")
plotDispEsts(disp.loc, main= "dispEst: local")

par(mfrow=c(1,1))


### calculate median of absolute residuals
median(abs(log(mcols(disp.par)$dispGeneEst) - log(mcols(disp.par)$dispFit)))
## [1] 1.020538
median(abs(log(mcols(disp.loc)$dispGeneEst) - log(mcols(disp.loc)$dispFit)))
## [1] 0.7595821
## it seems like the local fit is a little better (lower MEDIAN absolute residuals, suggesting that the distance of the residuals to the best fit line is lower)
### we can calculate the new results using local fit type

dds.loc<-DESeq(dds1, fitType = "local")
## using pre-existing normalization factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
resultsNames(dds.loc)
## [1] "regioncentral"            "regionsouth"             
## [3] "regioncentral.pop_n2"     "regionsouth.pop_n2"      
## [5] "regioncentral.pop_n3"     "regionsouth.pop_n3"      
## [7] "regioncentral.treatmentL" "regionsouth.treatmentL"
# compares Low vs High (high is the reference level for treatment) in the central region
res_loc_treat_central<-results(dds.loc, name="regioncentral.treatmentL")  
summary(res_loc_treat_central, alpha=0.05)
## 
## out of 17987 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 93, 0.52%
## LFC < 0 (down)     : 6, 0.033%
## outliers [1]       : 124, 0.69%
## low counts [2]     : 2080, 12%
## (mean count < 2)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# compares Low vs High (high is the reference level for treatment) in the SOUTH region
res_loc_treat_south<-results(dds.loc, name="regionsouth.treatmentL")  
summary(res_loc_treat_south, alpha=0.05)
## 
## out of 17987 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 478, 2.7%
## LFC < 0 (down)     : 562, 3.1%
## outliers [1]       : 124, 0.69%
## low counts [2]     : 3104, 17%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# lets combine them into a list

res_dds_loc<-list("central"=res_loc_treat_central,
              "south"=res_loc_treat_south)

# P-value distributions for the local-fit results, one panel per region
par(mfrow=c(2,1))
for(i in seq_along(res_dds_loc)){ # seq_along() is safe even for an empty list (1:length() is not)
  hist(res_dds_loc[[i]]$pvalue, breaks=40, col="grey", main=names(res_dds_loc)[i])
}

par(mfrow=c(1,1))

In general, very similar results, though perhaps a slight improvement

# plot numbers of up/down-regulated DEGs per region (local dispersion fit)
res_dds_loc %>%
  lapply(as_tibble,rownames = "gene_id") %>%
  bind_rows(.id="region") %>% # list names are regions (central/south), not populations
  drop_na(padj) %>% # drop all genes with NAs
  filter(padj<0.1) %>%
  mutate(updown=ifelse(log2FoldChange>0, "up", "down")) %>%
  group_by(region, updown) %>%
  summarise(n=n()) %>%
  mutate(n=ifelse(updown=="down", n*-1, n)) %>% # negate "down" counts so they plot below zero
  ggplot(aes(x=region, y=n, fill=updown)) +
  geom_bar(stat="identity") +
  theme_bw() +
  theme(legend.position = "none")
## `summarise()` has grouped output by 'population'. You can override using the
## `.groups` argument.

FDR based on empirical null-model

From the pvalue distributions, we can say that perhaps our data is not fitting the default negative binomial Wald test assumptions. Under this test, the p-value distribution is expected to be flat with an enrichment in low p-values. As we don’t recover this distribution, we have to conclude that the default DESeq analysis is overestimating the variance in our data.

To alleviate this, we have to instead estimate our own null distribution which we can do with the fdrtool package. You can read more about this here: https://seqqc.wordpress.com/2016/01/27/too-few-or-none-differential-expressed-genes-a-way-to-fix-the-null-hypothesis-in-deseq2/

# first we have to filter out all genes with NA p-values (fdrtool doesnt like those) and then run fdrtools

res_fdr_dds<-res_dds
res_fdrtools<-list()

# re-estimate the null distribution per region with fdrtool, then BH-adjust
for(i in seq_along(res_fdr_dds)){ # seq_along() is safe even for an empty list (1:length() is not)

  # 1. remove genes that have been filtered out by DESeq2's independent filtering
  res_fdr_dds[[i]]<-res_fdr_dds[[i]][ !is.na(res_fdr_dds[[i]]$padj), ]
  # 2. remove genes that have been identified by DESeq2 as outliers
  res_fdr_dds[[i]]<-res_fdr_dds[[i]][ !is.na(res_fdr_dds[[i]]$pvalue), ]
  # 3. use z-scores (Wald statistics) as input to fdrtool to re-estimate the p-values
  res_fdrtools[[i]]<-fdrtool(res_fdr_dds[[i]]$stat, statistic= "normal", plot = FALSE)

  # 4. overwrite old padj with BH-adjusted fdrtool p-values
  res_fdr_dds[[i]]$padj  <- p.adjust(res_fdrtools[[i]]$pval, method = "BH")
}
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
## 
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr

Lets look at these new p values

# New diagnostic plots: p-value histograms from the empirical null, one per region
par(mfrow=c(2,1))
for(i in seq_along(res_fdrtools)){ # seq_along() is safe even for an empty list (1:length() is not)
  hist(res_fdrtools[[i]]$pval, breaks=40, col="grey", main=names(res_dds)[i]) # res_fdrtools is unnamed, so take names from res_dds
}

par(mfrow=c(1,1))

These look a little better, though notice the more even dispersion of pvalues (i.e. less relative enrichment of low p values)

## lets see how many DEGs there are now
lapply(res_fdr_dds, summary, alpha=0.05)
## 
## out of 14759 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 57, 0.39%
## LFC < 0 (down)     : 3, 0.02%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
## 
## 
## out of 14418 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 0, 0%
## LFC < 0 (down)     : 0, 0%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 7)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
## $central
## NULL
## 
## $south
## NULL

We have essentially lost all DEGs for the south, and most for the central region. This seems very strange.

Compare overlap

Rather than just the numbers of genes, we also want to see how many shared sets of genes the different populations have in their sets of DEGs.

# make function to extract lists of DEGs per population
# Extract the IDs of significant DEGs (padj < 0.05) from a DESeq2 results object.
# Rows with NA padj are dropped implicitly by filter().
extract_degs <- function(x) {
  x %>%
    as_tibble(rownames = "gene") %>%
    filter(padj < 0.05) %>%
    pull(gene)
}

#  extract all
sig_deg<-lapply(res_dds, FUN=extract_degs)
names(sig_deg)
## [1] "central" "south"
str(sig_deg)
## List of 2
##  $ central: chr [1:93] "PECUL23A000127" "PECUL23A000194" "PECUL23A000652" "PECUL23A001362" ...
##  $ south  : chr [1:983] "PECUL23A000011" "PECUL23A000059" "PECUL23A000253" "PECUL23A000300" ...
# Plot Upset

upset(fromList(sig_deg),
      nsets = length(sig_deg),
      keep.order = T,
      nintersects = 100,
      number.angles = 0, point.size = 3, line.size = 1,
      sets.x.label = "Number of DEGs",
      set_size.show = TRUE,
      set_size.scale_max = max(sapply(sig_deg, length))+50, # needed only to expand the axis a bit
      text.scale = c(1.2, 1.2, 1.2, 1.2, 1.5, 1.5),
      sets.bar.color = c("skyblue","chartreuse3"),
      order.by=c("degree","freq"))

Most DEGs are unique to regions, but we get some genes that are overlapping.

We can also plot this as Venn diagrams, seeing as there are only two sets

library(ggVennDiagram)
library(scico)

sig_deg %>%
  ggVennDiagram(edge_size = 0) +
  scale_fill_scico(palette = "batlow") +
  theme(legend.position="none")

see direction of expression

# DEGs only differentially expressed in the central populations
# (central DEGs minus those shared with south)
assay(vst_dds1)[sig_deg$central[!sig_deg$central %in% intersect(sig_deg$central,sig_deg$south)],] %>%
  as_tibble(rownames="gene_id") %>%
  pivot_longer(-gene_id, names_to="sample_id", values_to = "vst") %>%
  left_join(des.mat) %>%
  ggplot(aes(x=treatment, y=vst)) +
  stat_summary(aes(group = gene_id, color=gene_id), fun = mean, geom = "path") + # `fun` replaces deprecated `fun.y` (ggplot2 >= 3.3.0)
  #stat_summary(aes(color = species), fun.data = mean_cl_boot, geom = "errorbar", width = 0.1) +
  stat_summary(aes(color = gene_id), fun = mean, geom = "point", size = 2) +
#  geom_point() +
  facet_wrap(~population) +
  scale_y_log10() +
  theme(legend.position = "none")
## Joining with `by = join_by(sample_id)`
## Warning: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
## ℹ Please use the `fun` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# DEGs only differentially expressed in the south populations
# (south DEGs minus those shared with central)
assay(vst_dds1)[sig_deg$south[!sig_deg$south %in% intersect(sig_deg$central,sig_deg$south)],] %>%
  as_tibble(rownames="gene_id") %>%
  pivot_longer(-gene_id, names_to="sample_id", values_to = "vst") %>%
  left_join(des.mat) %>%
  ggplot(aes(x=treatment, y=vst)) +
  stat_summary(aes(group = gene_id, color=gene_id), fun = mean, geom = "path") + # `fun` replaces deprecated `fun.y` (ggplot2 >= 3.3.0)
  #stat_summary(aes(color = species), fun.data = mean_cl_boot, geom = "errorbar", width = 0.1) +
  stat_summary(aes(color = gene_id), fun = mean, geom = "point", size = 2) +
#  geom_point() +
  facet_wrap(~population) +
  scale_y_log10() +
  theme(legend.position = "none")
## Joining with `by = join_by(sample_id)`

The differences in slopes between genes for the corresponding sets are not immediately obvious in my opinion

Export data

Lets export the results with the default null distribution and only light pre-filtering

# make a results folder if it does not yet exist
dir.create("results", showWarnings = FALSE)

# save best DESeq2 object
saveRDS(dds1, "./results/deseq2_regions_dds.rds")

# save results object
saveRDS(res_dds,
        "./results/deseq2_regions_results.rds")